***** Anomaly Detection using Isolation Forest *****

Loading Libraries

In [57]:
import datetime
import pandas as pd
import requests
import matplotlib as mpl
import os
import plotly.express as px
import numpy as np
from sklearn.ensemble import IsolationForest
from fbprophet import Prophet
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import ppscore as pps
import seaborn as sns
import datasist as ds
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import KNN
import warnings
warnings.simplefilter('ignore')

mpl.rcParams['figure.figsize'] = (10,8)
mpl.rcParams['axes.grid'] = False


print("Libraries imported succesfully")
Libraries imported succesfully

Loading the dataset from the given csv file

In [2]:
df_ads = pd.read_csv('ads_challenge.csv')
display(df_ads)
Date Country ad_type1_impressions ad_type1_CTR ad_type2_impressions ad_type2_videos_completed ad_type2_CTR
0 2019-01-02 Albania 23,962 1.47% 12,900 97.59% 1.37%
1 2019-01-02 Algeria 50,643 1.63% 35,458 97.77% 1.45%
2 2019-01-02 Argentina 760,871 0.65% 1,006,527 98.03% 0.47%
3 2019-01-02 Armenia 22,796 1.25% NaN NaN NaN
4 2019-01-02 Australia 407,314 0.58% 1,290,808 98.50% 0.48%
... ... ... ... ... ... ... ...
13212 2019-04-29 Macao NaN NaN 12,032 0.00% 0.66%
13213 2019-04-29 Malta NaN NaN 31,695 0.00% 0.72%
13214 2019-04-29 Palestine NaN NaN 33,792 0.00% 0.52%
13215 2019-04-29 Syria NaN NaN 14,035 0.00% 3.70%
13216 2019-04-29 Uzbekistan NaN NaN 13,698 0.00% 2.06%

13217 rows × 7 columns

In [3]:
# showing data types of the features
df_ads.dtypes
Out[3]:
Date                         object
Country                      object
ad_type1_impressions         object
ad_type1_CTR                 object
ad_type2_impressions         object
ad_type2_videos_completed    object
ad_type2_CTR                 object
dtype: object
In [4]:
# converting data datatype to datetime
df_ads.Date = pd.to_datetime(df_ads['Date'])
df_ads.dtypes
Out[4]:
Date                         datetime64[ns]
Country                              object
ad_type1_impressions                 object
ad_type1_CTR                         object
ad_type2_impressions                 object
ad_type2_videos_completed            object
ad_type2_CTR                         object
dtype: object

Exploratory Data Analysis

In [5]:
# showing information of the data type, You can also use df.summary() for the detailed summary.
df_ads.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13217 entries, 0 to 13216
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       13217 non-null  datetime64[ns]
 1   Country                    13202 non-null  object        
 2   ad_type1_impressions       5364 non-null   object        
 3   ad_type1_CTR               5364 non-null   object        
 4   ad_type2_impressions       13035 non-null  object        
 5   ad_type2_videos_completed  13035 non-null  object        
 6   ad_type2_CTR               13035 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 722.9+ KB
In [6]:
# Showing Missing Values
df_ads.isnull().sum()/len(df_ads)*100
Out[6]:
Date                          0.000000
Country                       0.113490
ad_type1_impressions         59.415904
ad_type1_CTR                 59.415904
ad_type2_impressions          1.377014
ad_type2_videos_completed     1.377014
ad_type2_CTR                  1.377014
dtype: float64
In [7]:
# Shape of the dataset
df_ads.shape
Out[7]:
(13217, 7)
In [8]:
# converting object data type to float
df_ads['ad_type1_impressions'] = df_ads['ad_type1_impressions'].str.replace(',','').astype('float')
df_ads['ad_type2_impressions'] = df_ads['ad_type2_impressions'].str.replace(',','').astype('float')
df_ads['ad_type1_CTR'] = df_ads['ad_type1_CTR'].str.replace('%','').astype('float') * 10**(-2)
df_ads['ad_type2_CTR'] = df_ads['ad_type2_CTR'].str.replace('%','').astype('float')* 10**(-2)
df_ads['ad_type2_videos_completed'] = df_ads['ad_type2_videos_completed'].str.replace('%','').astype('float')* 10**(-2)
df_ads.head()
Out[8]:
Date Country ad_type1_impressions ad_type1_CTR ad_type2_impressions ad_type2_videos_completed ad_type2_CTR
0 2019-01-02 Albania 23962.0 0.0147 12900.0 0.9759 0.0137
1 2019-01-02 Algeria 50643.0 0.0163 35458.0 0.9777 0.0145
2 2019-01-02 Argentina 760871.0 0.0065 1006527.0 0.9803 0.0047
3 2019-01-02 Armenia 22796.0 0.0125 NaN NaN NaN
4 2019-01-02 Australia 407314.0 0.0058 1290808.0 0.9850 0.0048

Feature Importance using PPS Score

In [9]:
# PPS- score of the variables
pps.predictors(df_ads, "ad_type2_CTR")
Out[9]:
x y ppscore case is_valid_score metric baseline_score model_score model
0 Country ad_type2_CTR 0.594429 regression True mean absolute error 0.004199 0.001703 DecisionTreeRegressor()
1 ad_type1_CTR ad_type2_CTR 0.027061 regression True mean absolute error 0.003892 0.003786 DecisionTreeRegressor()
2 Date ad_type2_CTR 0.000000 regression True mean absolute error 0.004239 0.004637 DecisionTreeRegressor()
3 ad_type1_impressions ad_type2_CTR 0.000000 regression True mean absolute error 0.003892 0.005494 DecisionTreeRegressor()
4 ad_type2_impressions ad_type2_CTR 0.000000 regression True mean absolute error 0.004239 0.005016 DecisionTreeRegressor()
5 ad_type2_videos_completed ad_type2_CTR 0.000000 regression True mean absolute error 0.004239 0.004947 DecisionTreeRegressor()
In [10]:
# Plotting Feature Importance
plt.figure(figsize=(20,3))
df_predictors = pps.predictors(df_ads, y="ad_type2_CTR")
display(sns.barplot(data=df_predictors, x="x", y="ppscore"))
<AxesSubplot:xlabel='x', ylabel='ppscore'>
In [11]:
# creating a dataframe with the important features selected. 
#For now I am selecting impressions as well as type 2 videos completed.
# We can omit them as their importance is not significant but this decision can be taken based on doman expertise knowledge 
# to improve the performance further. For now I am keeping these variables.
ctr_video = df_ads[['Date','Country','ad_type2_impressions','ad_type2_videos_completed','ad_type2_CTR','ad_type1_CTR']]
In [12]:
# Checking for value where type2 CTR and videos completed both are null.
ctr_video[(ctr_video.ad_type2_CTR.isna()) &(~ctr_video.ad_type2_videos_completed.isna())]
Out[12]:
Date Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
In [13]:
# checking for data where country data is not provided
ctr_video[ctr_video.Country.isna()]
Out[13]:
Date Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
3824 2019-02-07 NaN 13347.0 0.0000 0.0137 NaN
4169 2019-02-10 NaN 13926.0 0.0000 0.0101 NaN
4287 2019-02-11 NaN 14928.0 0.0000 0.0113 NaN
4521 2019-02-13 NaN 29022.0 0.0000 0.0085 NaN
4644 2019-02-14 NaN 13819.0 0.0000 0.0147 NaN
5468 2019-02-21 NaN 33157.0 0.0000 0.0052 NaN
5590 2019-02-22 NaN 29959.0 0.0000 0.0058 NaN
5714 2019-02-23 NaN 28713.0 0.0000 0.0043 NaN
5838 2019-02-24 NaN 30073.0 0.0000 0.0045 NaN
5961 2019-02-25 NaN 31432.0 0.0000 0.0056 NaN
6084 2019-02-26 NaN 31940.0 0.0000 0.0050 NaN
6207 2019-02-27 NaN 32039.0 0.0000 0.0051 NaN
6328 2019-02-28 NaN 13138.0 0.0000 0.0070 NaN
12012 2019-04-18 NaN 16682.0 0.8815 0.0053 NaN
12120 2019-04-19 NaN 14322.0 0.8817 0.0072 NaN
In [14]:
# Deleting(dropping) rows where impressions and videos completed are null and CTR is null
ctr_video.dropna(subset=['Date','ad_type2_impressions','ad_type2_videos_completed','ad_type2_CTR'], inplace = True, how = 'all')
ctr_video[ctr_video.ad_type2_CTR.isna()]
Out[14]:
Date Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
3 2019-01-02 Armenia NaN NaN NaN 0.0125
42 2019-01-02 Kosovo NaN NaN NaN 0.0134
44 2019-01-02 Laos NaN NaN NaN 0.0181
50 2019-01-02 Macedonia NaN NaN NaN 0.0079
53 2019-01-02 Mongolia NaN NaN NaN 0.0209
... ... ... ... ... ... ...
13117 2019-04-29 Bosnia and Herzegovina NaN NaN NaN 0.0065
13121 2019-04-29 Cameroon NaN NaN NaN 0.0196
13158 2019-04-29 Laos NaN NaN NaN 0.0153
13162 2019-04-29 Macedonia NaN NaN NaN 0.0097
13166 2019-04-29 Mongolia NaN NaN NaN 0.0161

182 rows × 6 columns

In [15]:
# setting Date as the Index
ctr_video = ctr_video.set_index('Date')

Visualizing Data

In [16]:
# Daily CTR Trend of United States 
fig = px.line(df_ads[df_ads.Country=="United States"].reset_index(), x = 'Date',y = ['ad_type1_CTR','ad_type1_impressions'],title = 'Type-1 CTR')

fig.update_xaxes(
    rangeslider_visible= True,
    rangeselector=dict(
                        buttons = list([
                        dict(count = 1,label = '1m',step='month',stepmode = "backward"),
                        dict(count = 2,label = '6m',step='month',stepmode = "backward"),
                        dict(count = 3,label = '12m',step='month',stepmode = "todate"),
                        dict(step= 'all')
                            ])        
                        )
                   )
fig.show()

As you can see that the data is entirely missing from Jan 9 till Feb 14th for both impressions and CTR for ads group-1 There can be multiple reasons for it

1) Data was not gathered/collected or measured due to some technical issue.

2) There was no data - no ads were shown during that period.

In order to solve the above cases , I am making as assumption that due to covid start - no ads was shown during that period for group 1 therefore we don't have datapoints avaialble for that period of time. There are multiple ways to deal with it .for the sake of simplicity for now, I am replacing these values with 0.

In [17]:
# Replacing null value with 0
ctr_video['ad_type1_CTR'] = ctr_video['ad_type1_CTR'].fillna(0)
In [18]:
# Visualization the trend of United stated again after replacing null with 0
fig = px.line(ctr_video[ctr_video.Country=="United States"].reset_index(), x = 'Date',y = ['ad_type1_CTR'],title = 'Type-1 CTR')

fig.update_xaxes(
    rangeslider_visible= True,
    rangeselector=dict(
                        buttons = list([
                        dict(count = 1,label = '1m',step='month',stepmode = "backward"),
                        dict(count = 2,label = '6m',step='month',stepmode = "backward"),
                        dict(count = 3,label = '12m',step='month',stepmode = "todate"),
                        dict(step= 'all')
                            ])        
                        )
                   )
fig.show()

Some more visualizations showing the data distribution

In [19]:
ds.visualizations.histogram(ctr_video)
In [20]:
num_features = ctr_video.select_dtypes(include=['int64', 'float64']).columns
ds.visualizations.scatterplot(ctr_video,num_features=num_features,target='ad_type2_CTR')
In [21]:
ds.visualizations.autoviz(ctr_video)
Shape of your Data Set: (13217, 5)
Classifying variables in data set...
    5 Predictors classified...
        This does not include the Target column(s)
    1 variables removed since they were ID or low-information variables
Number of All Scatter Plots = 10
No categorical or boolean vars in data set. Hence no bar charts.
Nothing to add Plot not being added
Time to run AutoViz (in seconds) = 2.027

Imputing Missing values using KNN imputer

In [22]:
# Finding numerical and categorical features
num_features = ctr_video.select_dtypes(include=['int64', 'float64']).columns
cat_features = ctr_video.select_dtypes(include=['object','category']).columns

# creating a dictionary for ordinal encoding -- > Doing ordinal encoding before KNN for object datatype
ordinal_enc_dict = {}

for col_name in ctr_video[cat_features]:
    # Create Ordinal encoder for col
    ordinal_enc_dict[col_name] = OrdinalEncoder()
    col = ctr_video[col_name]
    
    # Select non-null values of col
    col_not_null = col[col.notnull()]
    reshaped_vals = col_not_null.values.reshape(-1, 1)
    encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
    
    # Store the values to non-null values of the column
    ctr_video.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
# KNN imputer for imputin missing values using KNN
KNN_imputer = KNN()

# Impute the DataFrame
ctr_video.iloc[:, :] = KNN_imputer.fit_transform(ctr_video)
ctr_video.head()
Imputing row 1/13217 with 0 missing, elapsed time: 48.621
Imputing row 101/13217 with 0 missing, elapsed time: 48.628
Imputing row 201/13217 with 0 missing, elapsed time: 48.630
Imputing row 301/13217 with 0 missing, elapsed time: 48.630
Imputing row 401/13217 with 0 missing, elapsed time: 48.636
Imputing row 501/13217 with 3 missing, elapsed time: 48.638
Imputing row 601/13217 with 0 missing, elapsed time: 48.640
Imputing row 701/13217 with 0 missing, elapsed time: 48.642
Imputing row 801/13217 with 0 missing, elapsed time: 48.642
Imputing row 901/13217 with 0 missing, elapsed time: 48.642
Imputing row 1001/13217 with 0 missing, elapsed time: 48.642
Imputing row 1101/13217 with 0 missing, elapsed time: 48.647
Imputing row 1201/13217 with 0 missing, elapsed time: 48.647
Imputing row 1301/13217 with 0 missing, elapsed time: 48.647
Imputing row 1401/13217 with 0 missing, elapsed time: 48.649
Imputing row 1501/13217 with 0 missing, elapsed time: 48.649
Imputing row 1601/13217 with 0 missing, elapsed time: 48.649
Imputing row 1701/13217 with 0 missing, elapsed time: 48.650
Imputing row 1801/13217 with 0 missing, elapsed time: 48.651
Imputing row 1901/13217 with 0 missing, elapsed time: 48.651
Imputing row 2001/13217 with 0 missing, elapsed time: 48.652
Imputing row 2101/13217 with 0 missing, elapsed time: 48.653
Imputing row 2201/13217 with 0 missing, elapsed time: 48.653
Imputing row 2301/13217 with 0 missing, elapsed time: 48.654
Imputing row 2401/13217 with 0 missing, elapsed time: 48.654
Imputing row 2501/13217 with 0 missing, elapsed time: 48.654
Imputing row 2601/13217 with 0 missing, elapsed time: 48.654
Imputing row 2701/13217 with 0 missing, elapsed time: 48.654
Imputing row 2801/13217 with 0 missing, elapsed time: 48.657
Imputing row 2901/13217 with 0 missing, elapsed time: 48.657
Imputing row 3001/13217 with 0 missing, elapsed time: 48.657
Imputing row 3101/13217 with 0 missing, elapsed time: 48.659
Imputing row 3201/13217 with 0 missing, elapsed time: 48.659
Imputing row 3301/13217 with 0 missing, elapsed time: 48.660
Imputing row 3401/13217 with 0 missing, elapsed time: 48.660
Imputing row 3501/13217 with 0 missing, elapsed time: 48.660
Imputing row 3601/13217 with 0 missing, elapsed time: 48.660
Imputing row 3701/13217 with 0 missing, elapsed time: 48.660
Imputing row 3801/13217 with 0 missing, elapsed time: 48.660
Imputing row 3901/13217 with 0 missing, elapsed time: 48.660
Imputing row 4001/13217 with 0 missing, elapsed time: 48.660
Imputing row 4101/13217 with 0 missing, elapsed time: 48.660
Imputing row 4201/13217 with 0 missing, elapsed time: 48.667
Imputing row 4301/13217 with 0 missing, elapsed time: 48.667
Imputing row 4401/13217 with 0 missing, elapsed time: 48.669
Imputing row 4501/13217 with 0 missing, elapsed time: 48.669
Imputing row 4601/13217 with 0 missing, elapsed time: 48.670
Imputing row 4701/13217 with 0 missing, elapsed time: 48.670
Imputing row 4801/13217 with 0 missing, elapsed time: 48.672
Imputing row 4901/13217 with 0 missing, elapsed time: 48.672
Imputing row 5001/13217 with 0 missing, elapsed time: 48.673
Imputing row 5101/13217 with 0 missing, elapsed time: 48.673
Imputing row 5201/13217 with 0 missing, elapsed time: 48.673
Imputing row 5301/13217 with 0 missing, elapsed time: 48.673
Imputing row 5401/13217 with 0 missing, elapsed time: 48.673
Imputing row 5501/13217 with 0 missing, elapsed time: 48.673
Imputing row 5601/13217 with 0 missing, elapsed time: 48.677
Imputing row 5701/13217 with 0 missing, elapsed time: 48.677
Imputing row 5801/13217 with 0 missing, elapsed time: 48.678
Imputing row 5901/13217 with 0 missing, elapsed time: 48.680
Imputing row 6001/13217 with 0 missing, elapsed time: 48.680
Imputing row 6101/13217 with 0 missing, elapsed time: 48.680
Imputing row 6201/13217 with 0 missing, elapsed time: 48.680
Imputing row 6301/13217 with 0 missing, elapsed time: 48.680
Imputing row 6401/13217 with 0 missing, elapsed time: 48.680
Imputing row 6501/13217 with 0 missing, elapsed time: 48.686
Imputing row 6601/13217 with 0 missing, elapsed time: 48.688
Imputing row 6701/13217 with 0 missing, elapsed time: 48.689
Imputing row 6801/13217 with 0 missing, elapsed time: 48.689
Imputing row 6901/13217 with 0 missing, elapsed time: 48.690
Imputing row 7001/13217 with 0 missing, elapsed time: 48.690
Imputing row 7101/13217 with 0 missing, elapsed time: 48.690
Imputing row 7201/13217 with 0 missing, elapsed time: 48.690
Imputing row 7301/13217 with 0 missing, elapsed time: 48.690
Imputing row 7401/13217 with 0 missing, elapsed time: 48.690
Imputing row 7501/13217 with 0 missing, elapsed time: 48.690
Imputing row 7601/13217 with 0 missing, elapsed time: 48.690
Imputing row 7701/13217 with 0 missing, elapsed time: 48.690
Imputing row 7801/13217 with 0 missing, elapsed time: 48.690
Imputing row 7901/13217 with 0 missing, elapsed time: 48.690
Imputing row 8001/13217 with 0 missing, elapsed time: 48.690
Imputing row 8101/13217 with 0 missing, elapsed time: 48.698
Imputing row 8201/13217 with 0 missing, elapsed time: 48.699
Imputing row 8301/13217 with 0 missing, elapsed time: 48.700
Imputing row 8401/13217 with 0 missing, elapsed time: 48.701
Imputing row 8501/13217 with 0 missing, elapsed time: 48.701
Imputing row 8601/13217 with 0 missing, elapsed time: 48.701
Imputing row 8701/13217 with 0 missing, elapsed time: 48.701
Imputing row 8801/13217 with 0 missing, elapsed time: 48.701
Imputing row 8901/13217 with 0 missing, elapsed time: 48.701
Imputing row 9001/13217 with 0 missing, elapsed time: 48.701
Imputing row 9101/13217 with 0 missing, elapsed time: 48.701
Imputing row 9201/13217 with 0 missing, elapsed time: 48.701
Imputing row 9301/13217 with 0 missing, elapsed time: 48.701
Imputing row 9401/13217 with 0 missing, elapsed time: 48.701
Imputing row 9501/13217 with 0 missing, elapsed time: 48.701
Imputing row 9601/13217 with 0 missing, elapsed time: 48.708
Imputing row 9701/13217 with 0 missing, elapsed time: 48.709
Imputing row 9801/13217 with 0 missing, elapsed time: 48.709
Imputing row 9901/13217 with 0 missing, elapsed time: 48.710
Imputing row 10001/13217 with 0 missing, elapsed time: 48.710
Imputing row 10101/13217 with 0 missing, elapsed time: 48.711
Imputing row 10201/13217 with 0 missing, elapsed time: 48.711
Imputing row 10301/13217 with 0 missing, elapsed time: 48.711
Imputing row 10401/13217 with 0 missing, elapsed time: 48.711
Imputing row 10501/13217 with 0 missing, elapsed time: 48.711
Imputing row 10601/13217 with 0 missing, elapsed time: 48.711
Imputing row 10701/13217 with 0 missing, elapsed time: 48.711
Imputing row 10801/13217 with 0 missing, elapsed time: 48.711
Imputing row 10901/13217 with 0 missing, elapsed time: 48.718
Imputing row 11001/13217 with 0 missing, elapsed time: 48.718
Imputing row 11101/13217 with 0 missing, elapsed time: 48.733
Imputing row 11201/13217 with 0 missing, elapsed time: 48.736
Imputing row 11301/13217 with 0 missing, elapsed time: 48.740
Imputing row 11401/13217 with 0 missing, elapsed time: 48.741
Imputing row 11501/13217 with 0 missing, elapsed time: 48.745
Imputing row 11601/13217 with 0 missing, elapsed time: 48.745
Imputing row 11701/13217 with 3 missing, elapsed time: 48.751
Imputing row 11801/13217 with 0 missing, elapsed time: 48.754
Imputing row 11901/13217 with 0 missing, elapsed time: 48.759
Imputing row 12001/13217 with 0 missing, elapsed time: 48.762
Imputing row 12101/13217 with 0 missing, elapsed time: 48.767
Imputing row 12201/13217 with 0 missing, elapsed time: 48.772
Imputing row 12301/13217 with 0 missing, elapsed time: 48.772
Imputing row 12401/13217 with 0 missing, elapsed time: 48.782
Imputing row 12501/13217 with 0 missing, elapsed time: 48.787
Imputing row 12601/13217 with 0 missing, elapsed time: 48.790
Imputing row 12701/13217 with 0 missing, elapsed time: 48.794
Imputing row 12801/13217 with 0 missing, elapsed time: 48.800
Imputing row 12901/13217 with 0 missing, elapsed time: 48.803
Imputing row 13001/13217 with 0 missing, elapsed time: 48.803
Imputing row 13101/13217 with 0 missing, elapsed time: 48.813
Imputing row 13201/13217 with 0 missing, elapsed time: 48.813
Out[22]:
Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
Date
2019-01-02 0.0 12900.0 0.97590 0.01370 0.0147
2019-01-02 1.0 35458.0 0.97770 0.01450 0.0163
2019-01-02 2.0 1006527.0 0.98030 0.00470 0.0065
2019-01-02 3.0 25316.6 0.78624 0.01278 0.0125
2019-01-02 5.0 1290808.0 0.98500 0.00480 0.0058
In [23]:
# Loop over the column names
for col_name in cat_features:
    
    # Reshape the data
    reshaped = ctr_video[col_name].values.reshape(-1, 1)
    # Perform inverse transform of the ordinally encoded columns
    ctr_video[col_name] = ordinal_enc_dict[col_name].inverse_transform(reshaped)
ctr_video.head()
Out[23]:
Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
Date
2019-01-02 Albania 12900.0 0.97590 0.01370 0.0147
2019-01-02 Algeria 35458.0 0.97770 0.01450 0.0163
2019-01-02 Argentina 1006527.0 0.98030 0.00470 0.0065
2019-01-02 Armenia 25316.6 0.78624 0.01278 0.0125
2019-01-02 Australia 1290808.0 0.98500 0.00480 0.0058

Scaling the data using Min Max scaler

In [24]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

scaler = MinMaxScaler()
numeric_features = ['ad_type2_impressions','ad_type2_videos_completed','ad_type1_CTR','ad_type2_CTR']
# numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# df2 = pd.DataFrame(scaler.fit_transform(df),
#                    columns=['ad_type1_impressions','ad_type1_CTR','ad_type2_impressions','ad_type2_videos_completed','ad_type2_CTR'],
#                    index = ['Date'])
ctr_video[numeric_features] = scaler.fit_transform(ctr_video[numeric_features])
ctr_video.head()
Out[24]:
Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
Date
2019-01-02 Albania 0.000033 0.927133 0.150901 0.155063
2019-01-02 Algeria 0.000863 0.928843 0.159910 0.171941
2019-01-02 Argentina 0.036617 0.931313 0.049550 0.068565
2019-01-02 Armenia 0.000490 0.746950 0.140541 0.131857
2019-01-02 Australia 0.047084 0.935778 0.050676 0.061181

Visualizing the trend of United states for Impressions, videos completed and CTR

In [25]:
fig = px.line(ctr_video[ctr_video.Country=="United States"].reset_index(), x = 'Date',y = ['ad_type2_impressions','ad_type2_CTR', 'ad_type1_CTR'],title = 'Impressions Vs Click Through Rate')

fig.update_xaxes(
    rangeslider_visible= True,
    rangeselector=dict(
                        buttons = list([
                        dict(count = 1,label = '1m',step='month',stepmode = "backward"),
                        dict(count = 2,label = '6m',step='month',stepmode = "backward"),
                        dict(count = 3,label = '12m',step='month',stepmode = "todate"),
                        dict(step= 'all')
                            ])        
                        )
                   )
fig.show()
In [26]:
fig = px.line(ctr_video[ctr_video.Country=="United States"].reset_index(), x = 'Date',y = ['ad_type2_impressions','ad_type2_CTR'],title = 'Impressions Vs Click Through Rate')

fig.update_xaxes(
    rangeslider_visible= True,
    rangeselector=dict(
                        buttons = list([
                        dict(count = 1,label = '1m',step='month',stepmode = "backward"),
                        dict(count = 2,label = '6m',step='month',stepmode = "backward"),
                        dict(count = 3,label = '12m',step='month',stepmode = "todate"),
                        dict(step= 'all')
                            ])        
                        )
                   )
fig.show()
In [27]:
fig = px.line(ctr_video[ctr_video.Country=="India"].reset_index(), x = 'Date',y = ['ad_type2_CTR','ad_type2_impressions'],title = 'Impressions Vs Click Through Rate')

fig.update_xaxes(
    rangeslider_visible= True,
    rangeselector=dict(
                        buttons = list([
                        dict(count = 1,label = '1m',step='month',stepmode = "backward"),
                        dict(count = 2,label = '6m',step='month',stepmode = "backward"),
                        dict(count = 3,label = '12m',step='month',stepmode = "todate"),
                        dict(step= 'all')
                            ])        
                        )
                   )
fig.show()
In [28]:
ctr_video.query("Country=='United States'")[['ad_type2_impressions','ad_type2_CTR','ad_type2_videos_completed']].plot()
Out[28]:
<AxesSubplot:xlabel='Date'>
In [29]:
ctr_video[['Country', 'ad_type2_impressions', 'ad_type2_videos_completed',
       'ad_type2_CTR','ad_type1_CTR']].head()
Out[29]:
Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
Date
2019-01-02 Albania 0.000033 0.927133 0.150901 0.155063
2019-01-02 Algeria 0.000863 0.928843 0.159910 0.171941
2019-01-02 Argentina 0.036617 0.931313 0.049550 0.068565
2019-01-02 Armenia 0.000490 0.746950 0.140541 0.131857
2019-01-02 Australia 0.047084 0.935778 0.050676 0.061181
In [30]:
# Unique country names list
ctr_video['Country'].unique()
Out[30]:
array(['Albania', 'Algeria', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belgium',
       'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria',
       'Cambodia', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Cyprus',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia',
       'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala',
       'Honduras', 'Hong Kong', 'India', 'Indonesia', 'Iraq', 'Ireland',
       'Israel', 'Italy', 'Jamaica', 'Jordan', 'Kenya', 'Kosovo',
       'Kuwait', 'Laos', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg',
       'Macao', 'Macedonia', 'Malaysia', 'Mexico', 'Mongolia', 'Morocco',
       'Myanmar (Burma)', 'Nepal', 'Netherlands', 'New Zealand',
       'Nicaragua', 'Nigeria', 'Oman', 'Pakistan', 'Palestine', 'Panama',
       'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal',
       'Puerto Rico', 'Qatar', 'Romania', 'Saudi Arabia', 'Serbia',
       'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka',
       'Sweden', 'Switzerland', 'Taiwan', 'Thailand',
       'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Ukraine',
       'United Arab Emirates', 'United Kingdom', 'United States',
       'Uruguay', 'Venezuela', 'Vietnam', 'China', 'Croatia',
       'Czech Republic', 'Denmark', 'Finland', 'Hungary', 'Iceland',
       'Japan', 'Korea, South', 'Malta', 'Norway', 'Russia', 'Slovakia',
       'Syria', 'Tanzania', 'Unknown', 'Kazakhstan', 'Moldova', 'Reunion',
       'Zambia', 'Belarus', 'Guadeloupe', 'Martinique', 'Namibia',
       "Cote d'Ivoire", 'Mauritius', 'Cameroon', 'Senegal',
       'Bahamas, The', 'Mozambique', 'Zimbabwe', 'Uganda', 'Uzbekistan',
       'Aruba', 'Guam'], dtype=object)
In [31]:
# resetting the index
ctr_video = ctr_video.reset_index()
ctr_video.head()
Out[31]:
Date Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
0 2019-01-02 Albania 0.000033 0.927133 0.150901 0.155063
1 2019-01-02 Algeria 0.000863 0.928843 0.159910 0.171941
2 2019-01-02 Argentina 0.036617 0.931313 0.049550 0.068565
3 2019-01-02 Armenia 0.000490 0.746950 0.140541 0.131857
4 2019-01-02 Australia 0.047084 0.935778 0.050676 0.061181

Modelling - Using Isolation Forest for Anomaly detection

In [50]:
# Grouping the data by Country. Since Naibia and Guam has less than 2 points so I am commenting them.
country_vid_ctr = ctr_video[(ctr_video.Country != 'Namibia') & (ctr_video.Country != 'Guam')].reset_index().groupby('Country')
country_vid_ctr.head()
Out[50]:
index Date Country ad_type2_impressions ad_type2_videos_completed ad_type2_CTR ad_type1_CTR
0 0 2019-01-02 Albania 0.000033 0.927133 0.150901 0.155063
1 1 2019-01-02 Algeria 0.000863 0.928843 0.159910 0.171941
2 2 2019-01-02 Argentina 0.036617 0.931313 0.049550 0.068565
3 3 2019-01-02 Armenia 0.000490 0.746950 0.140541 0.131857
4 4 2019-01-02 Australia 0.047084 0.935778 0.050676 0.061181
... ... ... ... ... ... ... ...
11779 11793 2019-04-16 Aruba 0.000031 0.000000 0.061937 0.000000
11890 11904 2019-04-17 Aruba 0.000022 0.000000 0.072072 0.000000
11998 12013 2019-04-18 Aruba 0.000014 0.000000 0.083333 0.000000
12105 12121 2019-04-19 Aruba 0.000029 0.950029 0.055180 0.000000
12212 12228 2019-04-20 Aruba 0.000023 0.000000 0.049550 0.000000

633 rows × 7 columns

In [53]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

def plot_anomaly(df,metric_name):
    df.load_date = pd.to_datetime(df['load_date'].astype(str), format="%Y-%m-%d")
    dates = df.load_date
    #identify the anomaly points and create a array of its values for plot
    bool_array = (abs(df['anomaly']) > 0)
    actuals = df["actuals"][-len(bool_array):]
    anomaly_points = bool_array * actuals
    anomaly_points[anomaly_points == 0] = np.nan
    #A dictionary for conditional format table based on anomaly
    color_map = {0: "aliceblue", 1: "yellow", 2: "red"}
    
    #Table which includes Date,Actuals,Change occured from previous point
    table = go.Table(
        domain=dict(x=[0, 1],
                    y=[0, 0.3]),
        columnwidth=[1, 2],
        
        header=dict(height=20,
                    values=[['<b>Date</b>'], ['<b>Actual Values </b>'], ['<b>% Change </b>'],['<b>Score</b>'],
                            ['<b>anomaly_class</b>']],
                    font=dict(color=['rgb(45, 45, 45)'] * 5, size=14),
                    fill=dict(color='#d562be')),
        cells=dict(values=[df.round(3)[k].tolist() for k in ['load_date', 'actuals', 'percentage_change','score','anomaly_class']],
                   line=dict(color='#506784'),
                   align=['center'] * 5,
                   font=dict(color=['rgb(40, 40, 40)'] * 5, size=12),
                   suffix=[None] + [''] + [''] + ['%'] + [''],
                   height=27,
                   fill=dict(color=[anomaly_df['anomaly_class'].map(color_map)],#map based on anomaly level from dictionary
                   )
                   ))
    #Plot the actuals points
    Actuals = go.Scatter(name='Actuals',
                         x=dates,
                         y=df['actuals'],
                         xaxis='x1', yaxis='y1',
                         mode='lines',
                         marker=dict(size=12,
                                     line=dict(width=1),
                                     color="blue"))
    #Highlight the anomaly points
    anomalies_map = go.Scatter(name="Anomaly",
                               showlegend=True,
                               x=dates,
                               y=anomaly_points,
                               mode='markers',
                               xaxis='x1',
                               yaxis='y1',
                               marker=dict(color="red",
                                           size=11,
                                           line=dict(
                                               color="red",
                                               width=2)))
    axis = dict(
            showline=True,
            zeroline=False,
            showgrid=True,
            mirror=True,
            ticklen=4,
            gridcolor='#ffffff',
            tickfont=dict(size=10))
    layout = dict(
            width=1000,
            height=865,
            autosize=False,
            title=metric_name,
            margin=dict(t=75),
            showlegend=True,
            xaxis1=dict(axis, **dict(domain=[0, 1], anchor='y1', showticklabels=True)),
            yaxis1=dict(axis, **dict(domain=[2 * 0.21 + 0.20, 1], anchor='x1', hoverformat='.2f')))
    fig = go.Figure(data=[table, anomalies_map, Actuals], layout=layout)
    iplot(fig)
    pyplot.show()


def classify_anomalies(df,metric_name):
    df['metric_name']=metric_name
    df = df.sort_values(by='load_date', ascending=False)
    #Shift actuals by one timestamp to find the percentage chage between current and previous data point
    df['shift'] = df['actuals'].shift(-1)
    df['percentage_change'] = ((df['actuals'] - df['shift']) / df['actuals']) * 100
    #Categorise anomalies as 0-no anomaly, 1- low anomaly , 2 - high anomaly
    df['anomaly'].loc[df['anomaly'] == 1] = 0
    df['anomaly'].loc[df['anomaly'] == -1] = 2
    df['anomaly_class'] = df['anomaly']
    max_anomaly_score = df['score'].loc[df['anomaly_class'] == 2].max()
    medium_percentile = df['score'].quantile(0.25)
    df['anomaly_class'].loc[(df['score'] > max_anomaly_score) & (df['score'] <= medium_percentile)] = 1
    return df
In [56]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D

for country in country_vid_ctr.groups:
    ctr = ctr_video[ctr_video.Country == country].reset_index()
    to_model_columns=['ad_type2_impressions','ad_type2_videos_completed','ad_type1_CTR','ad_type2_CTR']
    # Defining Isolation forest model
    clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.1), \
                            max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    # Fitting the data to the model
    clf.fit(ctr[to_model_columns])
    # Predicting Anomalies
    pred = clf.predict(ctr[to_model_columns])
    ctr['anomaly']=pred
    outliers=ctr.loc[ctr['anomaly']==-1]
    outlier_index=list(outliers.index)
    print(country)
    # Normalize and fit the metrics to a PCA to reduce the number of dimensions and then plot them in 3D highlighting
    # the anomalies to visualize anomalies to make sure these naomalies makes sense
    pca = PCA(n_components=3)  # Reduce to k=3 dimensions
    scaler = StandardScaler()
    #normalize the metrics
    X = scaler.fit_transform(ctr[to_model_columns])
    X_reduce = pca.fit_transform(X)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel("x_composite_3")
    # Plot the compressed data points
    ax.scatter(X_reduce[:, 0], X_reduce[:, 1], zs=X_reduce[:, 2], s=4, lw=1, label="inliers",c="green")
    # Plot x's for the ground truth outliers
    ax.scatter(X_reduce[outlier_index,0],X_reduce[outlier_index,1], X_reduce[outlier_index,2],
               lw=2, s=60, marker="x", c="red", label="outliers")
    ax.legend()
    plt.show()
    
    del ctr['index']

    clf.fit(ctr.iloc[:, 4:5])
    pred = clf.predict(ctr.iloc[:, 4:5])
    anomaly_df = pd.DataFrame()
    anomaly_df['load_date'] = ctr['Date']
    # Find decision function to find the score and classify anomalies
    anomaly_df['score'] = clf.decision_function(ctr.iloc[:, 4:5])
    anomaly_df['actuals'] = ctr.iloc[:, 4:5]
    anomaly_df['anomaly'] = pred
    # Get the indexes of outliers in order to compare the metrics  with use case anomalies if required
    outliers = anomaly_df.loc[anomaly_df['anomaly'] == -1]
    outlier_index = list(outliers.index)
    anomaly_df = classify_anomalies(anomaly_df, country)
    # Plotting Anomaly
    plot_anomaly(anomaly_df, country)
Albania
Algeria
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas, The
Bahrain
Bangladesh
Belarus
Belgium
Bolivia
Bosnia and Herzegovina
Brazil
Bulgaria
Cambodia
Cameroon
Canada
Chile
China
Colombia
Costa Rica
Cote d'Ivoire
Croatia
Cyprus
Czech Republic
Denmark
Dominican Republic
Ecuador
Egypt
El Salvador
Estonia
Finland
France
Georgia
Germany
Ghana
Greece
Guadeloupe
Guatemala
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Korea, South
Kosovo
Kuwait
Laos
Latvia
Lebanon
Lithuania
Luxembourg
Macao
Macedonia
Malaysia
Malta
Martinique
Mauritius
Mexico
Moldova
Mongolia
Morocco
Mozambique
Myanmar (Burma)
Nepal
Netherlands
New Zealand
Nicaragua
Nigeria
Norway
Oman
Pakistan
Palestine
Panama
Paraguay
Peru
Philippines
Poland
Portugal
Puerto Rico
Qatar
Reunion
Romania
Russia
Saudi Arabia
Senegal
Serbia
Singapore
Slovakia
Slovenia
South Africa
Spain
Sri Lanka
Sweden
Switzerland
Syria
Taiwan
Tanzania
Thailand
Trinidad and Tobago
Tunisia
Turkey
Uganda
Ukraine
United Arab Emirates
United Kingdom
United States
Unknown
Uruguay
Uzbekistan
Venezuela
Vietnam
Zambia
Zimbabwe
In [ ]: